import scanpy as sc
import pandas as pd
import numpy as np
from scipy.stats import gmean
import seaborn as sns
import anndata as ad
import math
import matplotlib.pyplot as plt
import yaml
import rpy2
import holoviews as hv
with open("../Paths.yaml") as f:
    Paths = yaml.full_load(f)
baseDataPath = Paths["Paths"]["External"]
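The loader above assumes Paths.yaml exposes a top-level "Paths" mapping with an "External" key; a minimal, optional sanity check under that assumption (the layout shown in the comment is hypothetical):
# Assumed Paths.yaml layout:
# Paths:
#   External: /path/to/external/data
assert "Paths" in Paths and "External" in Paths["Paths"], "Unexpected Paths.yaml layout"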
nucleiDS = ["TotalSeqA_nuclei", "CMO_nuclei"]
cellsDS = ["TotalSeqC_cells", "TotalSeqA_cells", "LMO_MULTseq_cells", "LMO_custom_cells"]
ds = "CMO_nuclei"
adata = sc.read_h5ad(baseDataPath+"/DataByExp/"+ds+"/"+ds+".h5ad")
adata
AnnData object with n_obs × n_vars = 15356 × 33812
var: 'gene_ids', 'feature_types'
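As a quick check that both modalities are present, we can tabulate the feature types (the exact non-GEX label, e.g. "Antibody Capture" or "Multiplexing Capture", depends on the Cell Ranger configuration):
adata.var["feature_types"].value_counts()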
# Here we separate HTO counts from GEX counts
HTOfeatures = adata[:,adata.var["feature_types"] != "Gene Expression"].var_names.tolist()
HTOfeatures
HTOdf = pd.DataFrame(adata[:,HTOfeatures].X.todense(), index=adata.obs_names, columns=["HTO@TAG."+i for i in HTOfeatures])
#HTOdf = np.log1p(HTOdf.divide(HTOdf.apply(gmean, axis=1), axis = 0))
adata.obs = pd.concat([adata.obs,HTOdf], axis = 1)
adata.obs
# Keep only GEX
adata = adata[:,adata.var["feature_types"] == "Gene Expression"]
# Import demultiplexing results
GeneticID = pd.read_csv(baseDataPath+"/DataByExp/"+ds+"/"+ds+"_freemuxlet_MULTI_HTO_GMM_annotations.csv", sep ="\t", index_col=0, usecols=["cell","freemuxlet_BEST.GUESS","freemuxlet_DROPLET.TYPE"])
GeneticID.index = [i+"-1" for i in GeneticID.index.tolist()]
adata.obs = pd.concat([adata.obs,GeneticID], axis = 1).loc[adata.obs_names]
adata.obs["freemuxlet_BEST.GUESS"] = np.where(adata.obs["freemuxlet_DROPLET.TYPE"] == "Singlet",adata.obs["freemuxlet_BEST.GUESS"],"Doublet")
# Standard preprocessing
adata_tmp = adata.copy()
sc.pp.normalize_total(adata_tmp, target_sum=1e4)
sc.pp.log1p(adata_tmp)
sc.pp.highly_variable_genes(adata_tmp, min_mean=0.0125, max_mean=3, min_disp=0.5)
adata_tmp.raw = adata_tmp
# PCA and UMAP
sc.tl.pca(adata_tmp, svd_solver='arpack')
sc.pp.neighbors(adata_tmp, n_neighbors=30, n_pcs=40)
sc.tl.umap(adata_tmp)
sc.pl.umap(adata_tmp, color=['freemuxlet_BEST.GUESS'])
sc.pl.umap(adata_tmp, color=[c for c in adata_tmp.obs.columns if "HTO@TAG." in c], vmin='p1',vmax='p99')
For this purpose we export the HTO-only counts to R and run Seurat's MULTIseqDemux function.
import anndata2ri
import rpy2.rinterface_lib.callbacks
import logging
rpy2.rinterface_lib.callbacks.logger.setLevel(logging.ERROR)
anndata2ri.activate()
%load_ext rpy2.ipython
htDF = adata.obs[[c for c in adata.obs.columns if "HTO@TAG." in c]]
htDFAdata = ad.AnnData(htDF.to_numpy(), obs=adata.obs)
htDFAdata.var_names = [c for c in adata.obs.columns if "HTO@TAG." in c]
htDFAdata.obs = htDFAdata.obs.drop(columns = [c for c in adata.obs.columns if "HTO@TAG." in c])
%%R -i htDFAdata
library(Seurat)
Sobj <- as.Seurat(htDFAdata, counts = "X", data = NULL)
Sobj
Sobj[["HTO"]] <- CreateAssayObject(counts = GetAssayData(object = Sobj, slot = "counts"))
Sobj <- NormalizeData(Sobj,assay = "HTO", normalization.method = "CLR")
Sobj <- MULTIseqDemux(Sobj, assay = "HTO",autoThresh = TRUE)
%%R -o TagsID
TagsID <- data.frame(
(lapply(Sobj@meta.data[,c("MULTI_ID","MULTI_classification")], function(x){gsub("-", "_", x)})),
row.names = rownames(Sobj@meta.data)
)
try:
    adata.obs = adata.obs.drop(columns="MULTI_ID")
except KeyError:
    print("MULTI_ID obs not found")
TagsID = TagsID[["MULTI_ID"]]
adata.obs = pd.concat([adata.obs,TagsID], axis = 1)
# Map each HTO tag to the freemuxlet genotype cluster with which it overlaps best
MapDict = dict(pd.crosstab(adata.obs.loc[~adata.obs["MULTI_ID"].isin(["Negative","Doublet"]),"MULTI_ID"], adata.obs.loc[~adata.obs["MULTI_ID"].isin(["Negative","Doublet"]),"freemuxlet_BEST.GUESS"]).idxmax(axis = 1))
print(MapDict)
adata.obs["MULTI_ID"] = adata.obs["MULTI_ID"].replace(MapDict)+"_HTO"
MULTI_ID obs not found
{'HTO@TAG.Hash_MULTI_1': 'MCF7', 'HTO@TAG.Hash_MULTI_2': 'PC3', 'HTO@TAG.Hash_MULTI_3': 'DU145', 'HTO@TAG.Hash_MULTI_4': 'MDAMB231'}
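The mapping above is simply a contingency table followed by a row-wise argmax; a toy sketch with made-up labels illustrates the idea:
toy = pd.DataFrame({"tag": ["A", "A", "B", "B", "B"],
                    "genotype": ["g1", "g1", "g2", "g2", "g1"]})
dict(pd.crosstab(toy["tag"], toy["genotype"]).idxmax(axis=1))
# -> {'A': 'g1', 'B': 'g2'}: each tag maps to its most frequent genotype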
# Remove cells with fewer than 200 detected genes
sc.pp.filter_cells(adata, min_genes=200)
# filter on max mito genes %
adata.var['mt'] = adata.var_names.str.startswith('MT-') # annotate the group of mitochondrial genes as 'mt'
sc.pp.calculate_qc_metrics(adata, qc_vars=['mt'], percent_top=None, log1p=False, inplace=True)
sc.pl.violin(adata, ['n_genes_by_counts', 'total_counts', 'pct_counts_mt'],
jitter=0.4, multi_panel=True)
adata
# MAD-based filtering (MAD here is the mean absolute deviation from the median,
# not the classical median absolute deviation)
MAD = abs(adata.obs["pct_counts_mt"] - adata.obs["pct_counts_mt"].median()).sum() / adata.shape[0]
print("Only cells with pct mito between {} and {} will be kept".format(round(adata.obs["pct_counts_mt"].median() - 2*MAD, 2), round(adata.obs["pct_counts_mt"].median() + 2*MAD, 2)))
print("Ncells: Pre filter {} vs post filter {}".format(adata.shape[0],
adata[(adata.obs["pct_counts_mt"] >= adata.obs["pct_counts_mt"].median() - 2*MAD) & (adata.obs["pct_counts_mt"] <= adata.obs["pct_counts_mt"].median() + 2*MAD)].shape[0]))
adata = adata[(adata.obs["pct_counts_mt"] >= adata.obs["pct_counts_mt"].median() - 2*MAD) & (adata.obs["pct_counts_mt"] <= adata.obs["pct_counts_mt"].median() + 2*MAD)]
Only cells with pct mito between 0 and 0 will be kept
Ncells: Pre filter 15356 vs post filter 14109
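If the classical median absolute deviation is preferred, scipy provides it directly; a sketch (the thresholds, and hence the number of kept cells, would differ from the run above):
from scipy.stats import median_abs_deviation
med = adata.obs["pct_counts_mt"].median()
mad = median_abs_deviation(adata.obs["pct_counts_mt"])
lower, upper = med - 2 * mad, med + 2 * mad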
# First we replace zeros with 1, to avoid -inf values in the log of the CLR transform below
HTOdf = HTOdf.replace(0, 1)
CLR_htos = np.log(HTOdf.divide(HTOdf.apply(gmean, axis=1), axis = 0))
#CLR_htos = CLR_htos.fillna(0)
for col in [c for c in adata.obs.columns if "HTO@TAG." in c]:
    adata.obs[col] = CLR_htos[col]
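The lines above apply a per-cell centered log-ratio (CLR) transform: each tag count is divided by the geometric mean of that cell's tag counts before taking the log. A minimal helper capturing the same computation (hypothetical name; note that Seurat's NormalizeData CLR exposes a margin argument, so its axis convention may differ):
def clr_per_cell(df):
    # Row-wise CLR: log(count / geometric mean of the cell's tag counts)
    return np.log(df.divide(df.apply(gmean, axis=1), axis=0))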
# Standard preprocessing
sc.pp.normalize_total(adata, target_sum=1e4)
sc.pp.log1p(adata)
sc.pp.highly_variable_genes(adata, min_mean=0.0125, max_mean=3, min_disp=0.5)
adata.raw = adata
# PCA and UMAP
sc.tl.pca(adata, svd_solver='arpack')
sc.pp.neighbors(adata, n_neighbors=30, n_pcs=40)
sc.tl.umap(adata)
sc.pl.umap(adata, color=['freemuxlet_BEST.GUESS',"MULTI_ID"], wspace=.3)
sc.pl.umap(adata, color=[c for c in adata.obs.columns if "HTO@TAG." in c])
sc.pl.violin(adata, keys=[c for c in adata.obs.columns if "HTO@TAG." in c], groupby='MULTI_ID', rotation=90)
g = sns.FacetGrid(adata.obs[[c for c in adata.obs.columns if "HTO@TAG." in c]].melt(), col="variable",hue="variable",height=3.5, aspect=2, col_wrap=2)
g.map(sns.kdeplot, "value")
nrow, ncol = math.ceil(len(adata.obs["MULTI_ID"].unique())/4), 4
axs = [(r,c) for r in range(0, nrow) for c in range(0, ncol) ]
figsize = (ncol*5, nrow*4.7) #(width, height)
#Set axes
fig, ax = plt.subplots(nrows=nrow, ncols=ncol, figsize=figsize)
fig.tight_layout(pad=1, h_pad=15) #space between plots
# Remove unused axes at the end of the grid (also correct when the group count is an exact multiple of ncol)
for i in axs[len(adata.obs["MULTI_ID"].unique()):]:
    fig.delaxes(ax[i])
#Plot
for n,i in enumerate(adata.obs["MULTI_ID"].unique()):
adataID = adata[adata.obs["MULTI_ID"] == i]
labels = adataID.obs["freemuxlet_BEST.GUESS"].value_counts().index.tolist()
sizes = adataID.obs["freemuxlet_BEST.GUESS"].value_counts().tolist()
myexplode = tuple([0.2]*len(labels))
ax[axs[n]].pie(sizes, radius=2,explode=myexplode,labels=labels, autopct='%1.1f%%',
shadow=True, startangle=45,rotatelabels=90 )
ax[axs[n]].axis('equal') # Equal aspect ratio ensures that pie is drawn as a circle.
ax[axs[n]].set_title(i,pad=50)
obsTupleToMap = ('freemuxlet_BEST.GUESS',"MULTI_ID")
SankeyDF = adata.obs[list(obsTupleToMap)]
SankeyDF = SankeyDF.groupby(list(obsTupleToMap)).size().reset_index().rename(columns={0: 'count'})
SankeyDF=SankeyDF[SankeyDF["count"] != 0]
hv.extension('bokeh')
sankey1 = hv.Sankey(SankeyDF, kdims=[obsTupleToMap[0],obsTupleToMap[1]], vdims="count")
sankey1.opts(label_position='outer',
edge_color=obsTupleToMap[0], edge_line_width=0,
node_alpha=1.0, node_width=40, node_sort=False,
width=1100, height=700, bgcolor="white")
SingletsDF = adata.obs.loc[(adata.obs["freemuxlet_DROPLET.TYPE"] == "Singlet") & (~adata.obs["MULTI_ID"].isin(["Negative_HTO","Doublet_HTO"])),["MULTI_ID","freemuxlet_BEST.GUESS"]]
SingletsDF.MULTI_ID = SingletsDF.MULTI_ID.str.replace("_HTO", "")
MatchingSNGS = (SingletsDF["MULTI_ID"] == SingletsDF["freemuxlet_BEST.GUESS"]).sum()
MatchingSNGS
NonSingletsDF = adata.obs.loc[(adata.obs["freemuxlet_DROPLET.TYPE"] != "Singlet") & (adata.obs["MULTI_ID"].isin(["Negative_HTO","Doublet_HTO"])),["MULTI_ID","freemuxlet_BEST.GUESS"]]
NonSingletsDF.MULTI_ID = NonSingletsDF.MULTI_ID.str.replace("_HTO", "")
MatchingNonSNGS = (NonSingletsDF["MULTI_ID"] == NonSingletsDF["freemuxlet_BEST.GUESS"]).sum()
MatchingNonSNGS
# Overall agreement between HTO-based and genotype-based assignments
(MatchingSNGS + MatchingNonSNGS) / adata.shape[0]
0.8373378694450351
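For a per-class view of the overall agreement computed above, a contingency table between the two assignments is a natural follow-up (a sketch, not part of the original run):
pd.crosstab(adata.obs["MULTI_ID"], adata.obs["freemuxlet_BEST.GUESS"], margins=True)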